library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
feature_description_original <- readxl::read_excel(
"data/feature_description.xlsx")
feature_description_original
customer_segmentation_raw <- read_csv2(
"data/customer_segmentation_test.csv",
col_types = list(col_character(), col_character(), col_character(), col_character(),
col_double(), col_double(), col_character(), col_double(), col_double(),
col_character(), col_double(), col_double(), col_character(), col_double(),
col_double(), col_character(), col_double(), col_double(), col_character(),
col_character(), col_character()),
guess_max = 400000
) %>% mutate(
`Date of Birth` = lubridate::dmy(`Date of Birth`),
Gender = as.factor(Gender),
MERCHANDISE2015 = as.factor(MERCHANDISE2015),
MERCHANDISE2016 = as.factor(MERCHANDISE2016),
MERCHANDISE2017 = as.factor(MERCHANDIESE2017),
MERCHANDISE2018 = as.factor(MERCHANDIESE2018),
MERCHANDISE2019 = as.factor(MERCHANDISE2019),
LastPaymentDate = lubridate::dmy(LastPaymentDate),
PenultimatePaymentDate = lubridate::dmy(PenultimatePaymentDate)
) %>% select(-c(MERCHANDIESE2017, MERCHANDIESE2018)) %>%
rename(DateOfBirth = `Date of Birth`,
ID =`Customer Number`)
## i Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
skimr::skim(customer_segmentation_raw)
| Name | customer_segmentation_raw |
| Number of rows | 406734 |
| Number of columns | 21 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| Date | 3 |
| factor | 6 |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ID | 0 | 1.00 | 10 | 10 | 0 | 406734 | 0 |
| Postcode | 9176 | 0.98 | 1 | 9 | 0 | 2982 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DateOfBirth | 155491 | 0.62 | 1902-04-21 | 2015-03-30 | 1948-03-09 | 25514 |
| LastPaymentDate | 0 | 1.00 | 2015-01-03 | 2020-02-13 | 2018-12-06 | 1361 |
| PenultimatePaymentDate | 44699 | 0.89 | 1995-12-31 | 2020-02-05 | 2017-04-12 | 5376 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Gender | 0 | 1 | FALSE | 3 | fem: 203904, mal: 183467, fam: 19363 |
| MERCHANDISE2015 | 0 | 1 | FALSE | 2 | 0: 401845, 1: 4889 |
| MERCHANDISE2016 | 0 | 1 | FALSE | 2 | 0: 401585, 1: 5149 |
| MERCHANDISE2019 | 0 | 1 | FALSE | 2 | 0: 401470, 1: 5264 |
| MERCHANDISE2017 | 0 | 1 | FALSE | 2 | 0: 402378, 1: 4356 |
| MERCHANDISE2018 | 0 | 1 | FALSE | 2 | 0: 401470, 1: 5264 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| COUNT2015 | 0 | 1 | 2.52 | 4.00 | 0 | 0 | 2 | 2 | 96.0 | ▇▁▁▁▁ |
| SUM2015 | 0 | 1 | 42.44 | 850.19 | 0 | 0 | 15 | 45 | 388113.6 | ▇▁▁▁▁ |
| COUNT2016 | 0 | 1 | 1.22 | 2.02 | 0 | 0 | 1 | 1 | 178.0 | ▇▁▁▁▁ |
| SUM2016 | 0 | 1 | 50.93 | 591.05 | 0 | 0 | 16 | 50 | 295599.8 | ▇▁▁▁▁ |
| COUNT2017 | 0 | 1 | 1.06 | 1.91 | 0 | 0 | 0 | 1 | 95.0 | ▇▁▁▁▁ |
| SUM2017 | 0 | 1 | 24.78 | 572.90 | 0 | 0 | 0 | 20 | 207134.7 | ▇▁▁▁▁ |
| COUNT2018 | 0 | 1 | 1.00 | 1.87 | 0 | 0 | 0 | 1 | 49.0 | ▇▁▁▁▁ |
| SUM2018 | 0 | 1 | 20.64 | 1552.60 | 0 | 0 | 0 | 15 | 911146.5 | ▇▁▁▁▁ |
| COUNT2019 | 0 | 1 | 0.97 | 1.79 | 0 | 0 | 0 | 1 | 31.0 | ▇▁▁▁▁ |
| SUM2019 | 0 | 1 | 46.44 | 3999.80 | 0 | 0 | 0 | 30 | 2400000.0 | ▇▁▁▁▁ |
Bin hier sehr offen für Verbesserungsvorschläge ^^
zip_code_list <- readxl::read_excel("data/PLZ_Verzeichnis-20211201.xls")
zip_code_list
customer_segmentation_with_zip <- customer_segmentation_raw %>%
left_join(zip_code_list, by = c("Postcode" = "PLZ")) %>%
select(-c(`gültig ab`, `gültig bis`, NamePLZTyp, intern_extern, adressierbar, Postfach)) %>%
drop_na(Postcode, Ort, Bundesland) %>%
mutate(Postcode = as.factor(Postcode),
Bundesland = as.factor(Bundesland))
customer_segmentation_with_zip
# here we define, which months should be understood as "christmas months" to define "XMAS_donation"
XMAS_months = c(11,
12,
1)
# this date will be used as the reference for this analysis
reference_date <- lubridate::ymd("2021-12-17")
customer_segmentation_first_prepro <- customer_segmentation_with_zip %>%
mutate(
# year of customer's birthday
year_born = lubridate::year(DateOfBirth),
# age of donors at their last donation
age_at_last_donation = lubridate::interval(DateOfBirth, LastPaymentDate) %>%
as.numeric("years") %>%
as.integer(),
generation_moniker = case_when(
year_born <= 1945 ~ "silent" ,
year_born <= 1964 ~ "boomer",
year_born <= 1980 ~ "x",
year_born <= 1996 ~ "millennial",
year_born <= 2012 ~ "z"
) %>% as_factor(),
# total number of donations over all years
COUNTtotal = COUNT2015+
COUNT2016+
COUNT2017+
COUNT2018+
COUNT2019,
# total donation amount over all years
SUMtotal = SUM2015+
SUM2016+
SUM2017+
SUM2018+
SUM2019,
# average donation amount
SUMaverage = SUMtotal / COUNTtotal,
# month of the last payment
LastPaymentMONTH = lubridate::month(LastPaymentDate) %>% as.factor(),
# month of second to last payment
PenultimatePaymentMONTH = lubridate::month(PenultimatePaymentDate) %>% as.factor(),
# year of the last payment
LastPaymentYEAR = lubridate::year(LastPaymentDate),
# year of second to last payment
PenultimatePaymentYEAR = lubridate::year(PenultimatePaymentDate),
# THIS ONE NEEDS WORK
# status as christmas donor if the last two payments were around christmas,
# but we have to tweak the time interval (is Nov to Jan too large?)
# also: what about people that only have one payment in total, that should be considered. The "maybe" status is shady at best
XMAS_donor = as_factor(case_when(LastPaymentMONTH %in% XMAS_months & PenultimatePaymentMONTH %in% XMAS_months ~ "yes",
LastPaymentMONTH %in% XMAS_months ~ "maybe",
TRUE ~ "unlikely")),
# days between last and second to last payment
donation_interval = lubridate::day(lubridate::days(LastPaymentDate - PenultimatePaymentDate)),
# days since the last payment in relation to our reference date
days_since_last_payment = as.integer(LastPaymentDate - reference_date),
# binary factor variable expressing if any merchandise was bought over the observation period (clumsily coded)
merchandise_any = as_factor(if_else(
!is.na(MERCHANDISE2015) & MERCHANDISE2015 != 0 |
!is.na(MERCHANDISE2016) & MERCHANDISE2016 != 0 |
!is.na(MERCHANDISE2017) & MERCHANDISE2017 != 0 |
!is.na(MERCHANDISE2018) & MERCHANDISE2018 != 0 |
!is.na(MERCHANDISE2019) & MERCHANDISE2019 != 0,
1,
0))) %>%
# grouping for the next mutation (num_of_donation_years)
group_by(ID) %>%
# number of years in which anything was donated (0-5)
mutate(num_of_donation_years = sum(COUNT2015 > 0,
COUNT2016 > 0,
COUNT2017 > 0,
COUNT2018 > 0,
COUNT2019 > 0, na.rm=T)) %>%
# ungrouping is important! ;)
# I learned that skimr tries to show its output based on groups if working with a grouped dataset... that crashed my computer twice ^^
ungroup() %>%
# remove variables that have no further use or
select(-c(ID, DateOfBirth, LastPaymentDate, PenultimatePaymentDate))
customer_segmentation_first_prepro
customer_segmentation_first_prepro %>% skimr::skim()
| Name | Piped data |
| Number of rows | 396694 |
| Number of columns | 34 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| factor | 13 |
| numeric | 20 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Ort | 0 | 1 | 2 | 40 | 0 | 2178 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Gender | 0 | 1.00 | FALSE | 3 | fem: 199545, mal: 179215, fam: 17934 |
| Postcode | 0 | 1.00 | FALSE | 2249 | 122: 6776, 121: 6208, 110: 5941, 502: 5383 |
| MERCHANDISE2015 | 0 | 1.00 | FALSE | 2 | 0: 391818, 1: 4876 |
| MERCHANDISE2016 | 0 | 1.00 | FALSE | 2 | 0: 391552, 1: 5142 |
| MERCHANDISE2019 | 0 | 1.00 | FALSE | 2 | 0: 391460, 1: 5234 |
| MERCHANDISE2017 | 0 | 1.00 | FALSE | 2 | 0: 392339, 1: 4355 |
| MERCHANDISE2018 | 0 | 1.00 | FALSE | 2 | 0: 391460, 1: 5234 |
| Bundesland | 0 | 1.00 | FALSE | 9 | N: 88175, W: 70706, O: 66082, St: 57348 |
| generation_moniker | 146208 | 0.63 | FALSE | 5 | sil: 110508, boo: 102068, x: 33020, mil: 4734 |
| LastPaymentMONTH | 0 | 1.00 | FALSE | 12 | 12: 119035, 11: 66379, 1: 45775, 10: 42275 |
| PenultimatePaymentMONTH | 37875 | 0.90 | FALSE | 12 | 12: 91203, 11: 56900, 10: 42674, 1: 27463 |
| XMAS_donor | 0 | 1.00 | FALSE | 3 | unl: 165505, may: 119746, yes: 111443 |
| merchandise_any | 0 | 1.00 | FALSE | 2 | 0: 377620, 1: 19074 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| COUNT2015 | 0 | 1.00 | 2.56 | 4.03 | 0.00 | 0.00 | 2.00 | 4.00 | 96.0 | ▇▁▁▁▁ |
| SUM2015 | 0 | 1.00 | 41.12 | 724.36 | 0.00 | 0.00 | 15.00 | 45.00 | 388113.6 | ▇▁▁▁▁ |
| COUNT2016 | 0 | 1.00 | 1.24 | 2.03 | 0.00 | 0.00 | 1.00 | 1.00 | 178.0 | ▇▁▁▁▁ |
| SUM2016 | 0 | 1.00 | 51.20 | 596.95 | 0.00 | 0.00 | 20.00 | 50.00 | 295599.8 | ▇▁▁▁▁ |
| COUNT2017 | 0 | 1.00 | 1.08 | 1.92 | 0.00 | 0.00 | 0.00 | 1.00 | 95.0 | ▇▁▁▁▁ |
| SUM2017 | 0 | 1.00 | 24.45 | 484.85 | 0.00 | 0.00 | 0.00 | 20.00 | 207134.7 | ▇▁▁▁▁ |
| COUNT2018 | 0 | 1.00 | 1.02 | 1.88 | 0.00 | 0.00 | 0.00 | 1.00 | 49.0 | ▇▁▁▁▁ |
| SUM2018 | 0 | 1.00 | 20.76 | 1570.91 | 0.00 | 0.00 | 0.00 | 15.00 | 911146.5 | ▇▁▁▁▁ |
| COUNT2019 | 0 | 1.00 | 0.98 | 1.80 | 0.00 | 0.00 | 0.00 | 1.00 | 31.0 | ▇▁▁▁▁ |
| SUM2019 | 0 | 1.00 | 46.90 | 4049.95 | 0.00 | 0.00 | 0.00 | 30.00 | 2400000.0 | ▇▁▁▁▁ |
| year_born | 146204 | 0.63 | 1949.25 | 14.01 | 1902.00 | 1939.00 | 1948.00 | 1959.00 | 2015.0 | ▁▇▇▂▁ |
| age_at_last_donation | 146204 | 0.63 | 68.33 | 14.00 | 0.00 | 59.00 | 70.00 | 79.00 | 117.0 | ▁▁▇▇▁ |
| COUNTtotal | 0 | 1.00 | 6.87 | 9.93 | 1.00 | 2.00 | 3.00 | 7.00 | 273.0 | ▇▁▁▁▁ |
| SUMtotal | 0 | 1.00 | 184.43 | 4898.70 | 0.01 | 30.00 | 65.00 | 160.00 | 2400225.0 | ▇▁▁▁▁ |
| SUMaverage | 0 | 1.00 | 36.08 | 1530.61 | 0.01 | 11.25 | 17.34 | 29.42 | 750000.0 | ▇▁▁▁▁ |
| LastPaymentYEAR | 0 | 1.00 | 2017.78 | 1.53 | 2015.00 | 2016.00 | 2018.00 | 2019.00 | 2020.0 | ▅▂▃▇▂ |
| PenultimatePaymentYEAR | 37875 | 0.90 | 2015.72 | 3.91 | 1995.00 | 2015.00 | 2017.00 | 2018.00 | 2020.0 | ▁▁▁▃▇ |
| donation_interval | 37875 | 0.90 | 773.66 | 1215.88 | 1.00 | 123.00 | 354.00 | 762.00 | 8766.0 | ▇▁▁▁▁ |
| days_since_last_payment | 0 | 1.00 | -1293.24 | 561.24 | -2540.00 | -1814.00 | -1102.00 | -762.00 | -673.0 | ▂▂▂▃▇ |
| num_of_donation_years | 0 | 1.00 | 2.50 | 1.49 | 1.00 | 1.00 | 2.00 | 4.00 | 5.0 | ▇▅▃▂▃ |
#Maybe it's a good idea to take out all the NAs for age. Obviously we lose a lot of rows, but 251000 left still seems plenty to me.
customer_segmentation_complete <- customer_segmentation_first_prepro %>% drop_na(year_born)
customer_segmentation_complete
ggplot(customer_segmentation_first_prepro, aes(XMAS_donor)) +
geom_bar() +
facet_wrap(~Gender)
ggplot(customer_segmentation_first_prepro, aes(num_of_donation_years)) +
geom_bar() +
facet_wrap(~generation_moniker)
ggplot(customer_segmentation_first_prepro %>% drop_na(age_at_last_donation), aes(age_at_last_donation)) +
geom_histogram(binwidth = 5)
ggplot(customer_segmentation_first_prepro %>% filter(SUMtotal > 0 & SUMtotal < 5000), aes(x = SUMtotal)) +
geom_histogram(binwidth = 100) +
facet_wrap(~Gender)
ggplot(customer_segmentation_first_prepro, aes(LastPaymentMONTH)) +
geom_bar() +
facet_wrap(~Gender)
ggplot(customer_segmentation_first_prepro, aes(PenultimatePaymentMONTH)) +
geom_bar() +
facet_wrap(~Gender)
ggplot(customer_segmentation_first_prepro %>% filter(COUNTtotal < (7 * 6)), aes(COUNTtotal)) +
geom_histogram(binwidth = 1)
ggplot(customer_segmentation_first_prepro %>% drop_na(donation_interval) %>% filter(donation_interval < (360 * 5)), aes(donation_interval)) +
geom_histogram(binwidth = 30)
mean_total_sum <- customer_segmentation_first_prepro$SUMtotal %>% mean(na.rm = TRUE)
sd_total_sum <- customer_segmentation_first_prepro$SUMtotal %>% sd(na.rm = TRUE)
ggplot(customer_segmentation_first_prepro %>% drop_na(year_born) %>% filter(SUMtotal < (mean_total_sum + sd_total_sum * 6)), aes(year_born, SUMtotal)) +
geom_point(alpha = 1 / 10)
# taken from https://de.statista.com/statistik/daten/studie/75396/umfrage/entwicklung-der-bevoelkerung-in-oesterreich-nach-bundesland-seit-1996/
pop_vienna <- 1921153
pop_lower_austria <- 1691040
pop_upper_austria <- 1495756
pop_styria <- 1247159
pop_tyrol <- 760161
pop_carithia <- 562230
pop_salzburg <- 560643
pop_vorarlberg <- 399164
pop_burgenland <- 296040
donors_per_state_per_100_000_inhabitants <- customer_segmentation_first_prepro %>%
select(Bundesland) %>%
group_by(Bundesland) %>%
count() %>%
ungroup() %>%
mutate(
n = case_when(
Bundesland == "B" ~ n / pop_burgenland * 100000,
Bundesland == "K" ~ n / pop_carithia * 100000,
Bundesland == "N" ~ n / pop_lower_austria * 100000,
Bundesland == "O" ~ n / pop_upper_austria * 100000,
Bundesland == "Sa" ~ n / pop_salzburg * 100000,
Bundesland == "St" ~ n / pop_styria * 100000,
Bundesland == "T" ~ n / pop_tyrol * 100000,
Bundesland == "V" ~ n / pop_vorarlberg * 100000,
Bundesland == "W" ~ n / pop_vienna * 100000
)
)
ggplot(donors_per_state_per_100_000_inhabitants, aes(Bundesland, n)) +
geom_col()
ggplot(customer_segmentation_first_prepro, aes(days_since_last_payment)) +
geom_histogram(binwidth = 30)
RFM segments customers according to three variabless: Recency, Frequency, Monetary Value. Using the rfm package, RFM scores can be computed either on raw transaction data (one row per transaction), or on aggregated customer data (one row per customer). For the former, the function rfm_table_order can be used, for the latter either rfm_table_customer or rfm_table_customer2. Since our dataset represents aggregated customer data, the latter should be used. It can be computer directly from the raw data:
library(rfm)
rfm_scores <- customer_segmentation_raw %>%
# create new variables: total donation sum; total number of donations
mutate(SUMtotal = SUM2015 + SUM2016 + SUM2017 + SUM2018 + SUM2019,
COUNTtotal = COUNT2015 + COUNT2016 + COUNT2017 + COUNT2018 + COUNT2019,
LastPaymentDate = as.Date(LastPaymentDate)) %>%
# compute RFM scores
rfm_table_customer_2(customer_id = ID,
n_transactions = COUNTtotal,
latest_visit_date = LastPaymentDate,
total_revenue = SUMtotal,
analysis_date = reference_date)
rfm_scores
## Warning in `[<-.data.frame`(`*tmp*`, is_list, value = list(`1` =
## "<tibble[,8]>", : replacement element 1 has 1 row to replace 0 rows
## Warning in `[<-.data.frame`(`*tmp*`, is_list, value = list(`1` =
## "<tibble[,8]>", : replacement element 2 has 1 row to replace 0 rows
Visual inspection of RFM scores:
We can see that higher monetary values are characterized by higher donation frequencies and more recent donations. There is an obvious cluster of low monetary value for frequency values in [1,2] and recency in [1,3]. These might be ‘sleepers’, i.e. customers who donated only a few times and are not active donors any more. In the upper left corner, we see very unrecent customers who donated above average for this recency score. It might be wort focusing on them, since they could be “reactivated” as donors, since they showed above-average donation willingness among low-frequency donors. Note: The higher the recency score, the more recent the last transaction!
rfm_heatmap(rfm_scores)
rfm_bar_chart(rfm_scores)
In the frequency vs monetary value plot below there is no strong correlation between donation frequency and monetary value. However, in the low-frequency area, there seems to be a slight positive correlation with monetary value.
rfm_fm_plot(rfm_scores)